library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
upstream_version = '2024-05-21'
reduced_version = '2024-06-03'
After trying the aggressive feature reduction, I’ll try something in between. The arNN of the first pass of reduced features performed more poorly than yesterday-is-today, and it might be because we’ve dropped parameters that are important (even if not as important as those listed in the SHAP). Also, I noticed a few redundant features that we should eliminate from the dataset.
We’ll keep the inflow data this time, but reduce the redundant features in the same way.
t2022 <- read_csv('data/NN_train_val_test/SMR_autoNN_daily/trainval_t2022_v2024-05-09.csv')
## Rows: 761 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t2022_reduce <- t2022
Let’s look at some correlation plots to determine if we should toss any redundant features, first by data group, since that’s likely where we’ll see some redundancy.
col_names = names(t2022_reduce)
yesterday <- t2022_reduce %>%
select(date, all_of(col_names[grepl("y_", col_names)]))
ggpairs(yesterday, columns = 2:ncol(yesterday))
Max 1 and max 0-5 are the same (which makes sense)
yesterday_reduced <- yesterday %>%
select(-y_max_1m_temp_degC)
ggpairs(yesterday_reduced, columns = 2:ncol(yesterday_reduced))
We could probably reduce this further, so let’s do that by dropping all the 1m summaries
yesterday_reduced <- yesterday_reduced %>%
select(-c(y_min_1m_temp_degC, y_mean_1m_temp_degC))
ggpairs(yesterday_reduced, columns = 2:ncol(yesterday_reduced))
temp <- t2022_reduce %>%
select(date, all_of(col_names[grepl('temp', col_names)])) %>%
select(-all_of(col_names[grepl('1m|5m', col_names)]))
ggpairs(temp, columns = 2:ncol(temp))
These are all < 0.9 correlation, so let’s leave it for now.
solrad <- t2022_reduce %>%
select(date, all_of(col_names[grepl('sol', col_names)]))
ggpairs(solrad, columns = 2:ncol(solrad))
These look good too.
pump <- t2022_reduce %>%
select(date, all_of(col_names[grepl('pump', col_names)]))
ggpairs(pump, columns = 2:ncol(pump))
Oh, right - there is a linear relationship between the average and the sum over a designated time period. We can toss the mean in favor of the sum.
pump_reduced <- pump %>%
select(-c(mean_pump_q_p2, mean_pump_q_p7))
ggpairs(pump_reduced, columns = 2:ncol(pump_reduced))
wind <- t2022_reduce %>%
select(date, all_of(col_names[grepl('wind', col_names)]))
ggpairs(wind, columns = 2:ncol(wind))
Min wind 3/5, Max wind 3/5, Min 10/5 - we can drop max/min 5 and solve these redundancies.
wind_reduced <- wind %>%
select(-c(max_wind_mps_5, min_wind_mps_5))
ggpairs(wind_reduced, columns = 2:ncol(wind_reduced))
precip <- t2022_reduce %>%
select(date, all_of(col_names[grepl('precip', col_names)]))
ggpairs(precip, columns = 2:ncol(precip))
Looks good. So dry.
NF <- t2022_reduce %>%
select(date, all_of(col_names[grepl('NF', col_names)]))
ggpairs(NF, columns = 2:ncol(NF))
Similar issue here where there are sum and averages that are 1:1, also dropping the other p2 parameters since they seem to have high correlation with minus 1 and minus 2 day values.
NF_reduced <- NF %>%
select(-c(sum_NF_q_p2, sum_NF_q_p7, max_NF_q_p2, mean_NF_q_p2))
ggpairs(NF_reduced, columns = 2:ncol(NF_reduced))
chip <- t2022_reduce %>%
select(date, all_of(col_names[grepl('chip', col_names)]))
ggpairs(chip, columns = 2:ncol(chip))
Remove the sum/average dupes
chip_reduced <- chip %>%
select(-c(sum_chip_q_p2, sum_chip_q_p7, max_chip_q_p2, mean_chip_q_p2))
ggpairs(chip_reduced, columns = 2:ncol(chip_reduced))
# collate data
t2022_reduced <- reduce(list(yesterday_reduced, pump_reduced, temp, wind_reduced, solrad, precip, NF_reduced, chip_reduced),
full_join)
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
reduce_names = names(t2022_reduced)
# add back in the labels
reduce_names = append(reduce_names, c("mean_1m_temp_degC", "mean_0_5m_temp_degC"))
years = seq(2014, 2021)
og = 'data/NN_train_val_test/SMR_autoNN_daily'
fp = 'data/NN_train_val_test/SMR_autoNN_reduce_2'
map(.x = years,
.f = ~ {
validation <- read_csv(file.path(og,
paste0('validation_t2022_',
.x,
'_v',
upstream_version,
'.csv'))) %>%
select(all_of(reduce_names))
write_csv(validation,
file.path(fp,
paste0("validation_t2022_",
.x,
"_reduced_v",
reduced_version,
".csv")))
training <- read_csv(file.path(og,
paste0('training_t2022_',
.x,
'_v',
upstream_version,
'.csv'))) %>%
select(all_of(reduce_names))
write_csv(training,
file.path(fp,
paste0("training_t2022_",
.x,
"_reduced_v",
reduced_version,
".csv")))
})
## Rows: 73 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 665 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 73 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 665 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 73 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 667 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 665 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 32 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 709 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 62 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 680 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 68 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 643 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 73 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 633 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [[1]]
## # A tibble: 665 × 55
## date y_max_0_5m_temp_degC y_min_0_5m_temp_degC
## <dttm> <dbl> <dbl>
## 1 2017-05-23 00:00:00 -2.77 -3.09
## 2 2017-05-24 00:00:00 -3.12 -2.89
## 3 2017-05-21 00:00:00 -2.81 -3.35
## 4 2017-05-20 00:00:00 -2.47 -2.87
## 5 2017-05-25 00:00:00 -2.88 -2.53
## 6 2017-05-22 00:00:00 -2.83 -3.17
## 7 2017-05-26 00:00:00 -2.80 -1.53
## 8 2017-05-27 00:00:00 -2.28 -1.57
## 9 2017-05-19 00:00:00 -2.82 -1.79
## 10 2017-05-11 00:00:00 -2.91 -2.16
## # ℹ 655 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## # pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## # sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## # mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## # mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## # mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
##
## [[2]]
## # A tibble: 665 × 55
## date y_max_0_5m_temp_degC y_min_0_5m_temp_degC
## <dttm> <dbl> <dbl>
## 1 2017-05-23 00:00:00 -2.77 -3.09
## 2 2017-05-24 00:00:00 -3.12 -2.89
## 3 2017-05-21 00:00:00 -2.81 -3.35
## 4 2017-05-20 00:00:00 -2.47 -2.87
## 5 2017-05-25 00:00:00 -2.88 -2.53
## 6 2017-05-22 00:00:00 -2.83 -3.17
## 7 2017-05-26 00:00:00 -2.80 -1.53
## 8 2017-05-27 00:00:00 -2.28 -1.57
## 9 2017-05-19 00:00:00 -2.82 -1.79
## 10 2014-06-12 00:00:00 -2.40 -2.21
## # ℹ 655 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## # pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## # sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## # mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## # mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## # mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
##
## [[3]]
## # A tibble: 667 × 55
## date y_max_0_5m_temp_degC y_min_0_5m_temp_degC
## <dttm> <dbl> <dbl>
## 1 2017-05-23 00:00:00 -2.77 -3.09
## 2 2017-05-24 00:00:00 -3.12 -2.89
## 3 2017-05-21 00:00:00 -2.81 -3.35
## 4 2017-05-20 00:00:00 -2.47 -2.87
## 5 2017-05-25 00:00:00 -2.88 -2.53
## 6 2017-05-22 00:00:00 -2.83 -3.17
## 7 2017-05-26 00:00:00 -2.80 -1.53
## 8 2017-05-27 00:00:00 -2.28 -1.57
## 9 2017-05-19 00:00:00 -2.82 -1.79
## 10 2014-06-12 00:00:00 -2.40 -2.21
## # ℹ 657 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## # pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## # sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## # mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## # mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## # mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
##
## [[4]]
## # A tibble: 665 × 55
## date y_max_0_5m_temp_degC y_min_0_5m_temp_degC
## <dttm> <dbl> <dbl>
## 1 2014-06-12 00:00:00 -2.40 -2.21
## 2 2016-09-25 00:00:00 -2.40 -1.56
## 3 2016-09-24 00:00:00 -1.84 -1.51
## 4 2016-09-27 00:00:00 -1.70 -1.77
## 5 2021-05-31 00:00:00 -1.98 -1.34
## 6 2016-09-26 00:00:00 -2.11 -1.89
## 7 2020-09-11 00:00:00 -1.88 -0.702
## 8 2020-05-25 00:00:00 -1.74 -1.91
## 9 2016-09-29 00:00:00 -1.59 -1.73
## 10 2021-06-01 00:00:00 -2.24 -1.55
## # ℹ 655 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## # pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## # sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## # mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## # mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## # mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
##
## [[5]]
## # A tibble: 709 × 55
## date y_max_0_5m_temp_degC y_min_0_5m_temp_degC
## <dttm> <dbl> <dbl>
## 1 2017-05-23 00:00:00 -2.77 -3.09
## 2 2017-05-24 00:00:00 -3.12 -2.89
## 3 2017-05-21 00:00:00 -2.81 -3.35
## 4 2017-05-20 00:00:00 -2.47 -2.87
## 5 2017-05-25 00:00:00 -2.88 -2.53
## 6 2017-05-22 00:00:00 -2.83 -3.17
## 7 2017-05-26 00:00:00 -2.80 -1.53
## 8 2017-05-27 00:00:00 -2.28 -1.57
## 9 2017-05-19 00:00:00 -2.82 -1.79
## 10 2014-06-12 00:00:00 -2.40 -2.21
## # ℹ 699 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## # pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## # sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## # mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## # mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## # mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
##
## [[6]]
## # A tibble: 680 × 55
## date y_max_0_5m_temp_degC y_min_0_5m_temp_degC
## <dttm> <dbl> <dbl>
## 1 2017-05-23 00:00:00 -2.77 -3.09
## 2 2017-05-24 00:00:00 -3.12 -2.89
## 3 2017-05-21 00:00:00 -2.81 -3.35
## 4 2017-05-20 00:00:00 -2.47 -2.87
## 5 2017-05-25 00:00:00 -2.88 -2.53
## 6 2017-05-22 00:00:00 -2.83 -3.17
## 7 2017-05-26 00:00:00 -2.80 -1.53
## 8 2017-05-27 00:00:00 -2.28 -1.57
## 9 2017-05-19 00:00:00 -2.82 -1.79
## 10 2014-06-12 00:00:00 -2.40 -2.21
## # ℹ 670 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## # pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## # sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## # mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## # mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## # mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
##
## [[7]]
## # A tibble: 643 × 55
## date y_max_0_5m_temp_degC y_min_0_5m_temp_degC
## <dttm> <dbl> <dbl>
## 1 2017-05-23 00:00:00 -2.77 -3.09
## 2 2017-05-24 00:00:00 -3.12 -2.89
## 3 2017-05-21 00:00:00 -2.81 -3.35
## 4 2017-05-20 00:00:00 -2.47 -2.87
## 5 2017-05-25 00:00:00 -2.88 -2.53
## 6 2017-05-22 00:00:00 -2.83 -3.17
## 7 2017-05-26 00:00:00 -2.80 -1.53
## 8 2017-05-27 00:00:00 -2.28 -1.57
## 9 2017-05-19 00:00:00 -2.82 -1.79
## 10 2014-06-12 00:00:00 -2.40 -2.21
## # ℹ 633 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## # pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## # sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## # mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## # mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## # mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
##
## [[8]]
## # A tibble: 633 × 55
## date y_max_0_5m_temp_degC y_min_0_5m_temp_degC
## <dttm> <dbl> <dbl>
## 1 2017-05-23 00:00:00 -2.77 -3.09
## 2 2017-05-24 00:00:00 -3.12 -2.89
## 3 2017-05-21 00:00:00 -2.81 -3.35
## 4 2017-05-20 00:00:00 -2.47 -2.87
## 5 2017-05-25 00:00:00 -2.88 -2.53
## 6 2017-05-22 00:00:00 -2.83 -3.17
## 7 2017-05-26 00:00:00 -2.80 -1.53
## 8 2017-05-27 00:00:00 -2.28 -1.57
## 9 2017-05-19 00:00:00 -2.82 -1.79
## 10 2014-06-12 00:00:00 -2.40 -2.21
## # ℹ 623 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## # pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## # sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## # mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## # mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## # mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
And also subset the test set
test <- read_csv(file.path(og, paste0('t2022_standardized_v',
upstream_version,
'.csv')))
## Rows: 129 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test_reduced <- test %>%
select(reduce_names)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(reduce_names)
##
## # Now:
## data %>% select(all_of(reduce_names))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
write_csv(test_reduced, file.path(fp, paste0('t2022_reduced_standardized_v', reduced_version, '.csv')))